library(GSVA)
library(survival)
library(stringi)
library(survminer)
library(pheatmap)
library(dplyr)
library(binr)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(tidyverse)
library(limma)
library(edgeR)
library(TCGAutils)
library(biomaRt)
library(org.Hs.eg.db)
library(ggrepel)
library(fgsea)
library(gridExtra)
library(fgsea)
library(pheatmap)
library(org.Hs.eg.db)
load("~/Documents/Elemento/AgeTCGA-master/DATA/cBioportal_Survival_Data/age_metadata.RData")
load("~/Documents/Elemento/AgeTCGA-master/DATA/NewTCGAcounts/Full Matrix.RData")
## This function will be called to bin age groups
age.cat <- function(x, lower = 1, upper, by = 10,
sep = "-", above.char = "+") {
labs <- c(paste(seq(lower, upper - by, by = by),
seq(lower + by - 1, upper - 1, by = by),
sep = sep),
paste(upper, above.char, sep = ""))
cut(floor(x), breaks = c(seq(lower, upper, by = by), Inf),
right = FALSE, labels = labs)
}
Make sure metadata and count data match up. Select breast cancer samples that are Q1 or Q4.
filtered_counts <- RNAseq_counts
## include primary tumors only
filtered_counts <- filtered_counts[ ,(TCGAbiospec(colnames(filtered_counts))$sample_definition == "Primary Solid Tumor" |
TCGAbiospec(colnames(filtered_counts))$sample_definition == "Primary Blood Derived Cancer - Peripheral Blood")]
## remove duplicates
filtered_counts <- filtered_counts[,-c(which(duplicated(TCGAbiospec(colnames(filtered_counts))$submitter_id)))]
## making sure both datasets have the same case id format
colnames(filtered_counts) <- substr(colnames(filtered_counts), start = 1, stop = 15)
## remove CASE ID factors from age metadata
## keep breast cancer Q1 and Q4
Age_OnyCT.meta$CASE_ID <- as.character(Age_OnyCT.meta$CASE_ID)
Age_OnyCT.meta <- Age_OnyCT.meta[which(Age_OnyCT.meta$TCGA_CancerCode == "brca" & Age_OnyCT.meta$Subtype_Quartile != "Q2-Q3"),]
Age_OnyCT.meta <- na.omit(Age_OnyCT.meta)
## selecting case IDs common to both datasets and discarding the rest
common_cases <- intersect(colnames(filtered_counts), Age_OnyCT.meta$CASE_ID)
filtered_age <- Age_OnyCT.meta[Age_OnyCT.meta$CASE_ID %in% common_cases,]
filtered_counts <- filtered_counts[,colnames(filtered_counts) %in% common_cases]
## sorting case IDs
filtered_age <- filtered_age[order(filtered_age$CASE_ID),]
filtered_counts <- filtered_counts[,order(colnames(filtered_counts))]
## create metadata
input <- as.data.frame(filtered_age[,c("CASE_ID","Subtype_Quartile")])
input <- na.omit(input)
input$Subtype_Quartile <- factor(input$Subtype_Quartile, levels = c("Q4","Q1"))
condition <- c("Q4","Q1")
comparison <- "Q4-Q1"
## create design and contrast matrices
design <- model.matrix(~0+ input$Subtype_Quartile)
colnames(design) <- condition
contmatrix <- makeContrasts(as.character(comparison),levels=design)
## filter low counts
filtered_counts<-filtered_counts[which(rowSums(cpm(filtered_counts)> 2) >= 2) ,]
## model fitting
Data.voom <- voom(filtered_counts,plot=FALSE)
fit <- lmFit(Data.voom,design)
fit2 <- contrasts.fit(fit,contmatrix)
fit2 <-eBayes(fit2)
tt <- topTable(fit2,n=Inf)
## convert ENSEMBL IDs to gene symbols
tt[,7] <- mapIds(org.Hs.eg.db, keys = rownames(tt), keytype = "ENSEMBL", column="SYMBOL", "first")
tt <- na.omit(tt)
tt <- tt[-which(duplicated(tt$V7)),]
rownames(tt) <- tt$V7
Create signatures from differentially expressed genes. These signatures are created based on logFC and FDR.
## four gene lists based on fdr and logFC
brca_age_up <- rownames(tt)[which(tt$adj.P.Val < 0.05 &tt$logFC>0)]
brca_age_down <- rownames(tt)[which(tt$adj.P.Val < 0.05 &tt$logFC<0)]
brca_age_up_logFC_1 <- rownames(tt)[which(tt$adj.P.Val < 0.05 &tt$logFC>1)]
brca_age_down_logFC_1 <- rownames(tt)[which(tt$adj.P.Val < 0.05 &tt$logFC < -1)]
brca_aging_geneset <- list(brca_age_up = brca_age_up,
brca_age_down = brca_age_down,
brca_age_up_logFC_1 = brca_age_up_logFC_1,
brca_age_down_logFC_1 = brca_age_down_logFC_1)
Make sure metadata and count data match up. Select all breast cancer samples.
load("~/Documents/Elemento/AgeTCGA-master/DATA/cBioportal_Survival_Data/age_metadata.RData")
filtered_counts <- RNAseq_counts
## include primary tumors only
filtered_counts <- filtered_counts[ ,(TCGAbiospec(colnames(filtered_counts))$sample_definition == "Primary Solid Tumor" |
TCGAbiospec(colnames(filtered_counts))$sample_definition == "Primary Blood Derived Cancer - Peripheral Blood")]
## remove duplicates
filtered_counts <- filtered_counts[,-c(which(duplicated(TCGAbiospec(colnames(filtered_counts))$submitter_id)))]
## making sure both datasets have the same case id format
colnames(filtered_counts) <- substr(colnames(filtered_counts), start = 1, stop = 15)
## remove CASE ID factors from age metadata
## keep breast cancer
Age_OnyCT.meta$CASE_ID <- as.character(Age_OnyCT.meta$CASE_ID)
Age_OnyCT.meta <- Age_OnyCT.meta[which(Age_OnyCT.meta$TCGA_CancerCode == "brca"),]
Age_OnyCT.meta <- na.omit(Age_OnyCT.meta)
## selecting case IDs common to both datasets and discarding the rest
common_cases <- intersect(colnames(filtered_counts), Age_OnyCT.meta$CASE_ID)
filtered_age <- Age_OnyCT.meta[Age_OnyCT.meta$CASE_ID %in% common_cases,]
filtered_counts <- filtered_counts[,colnames(filtered_counts) %in% common_cases]
## sorting case IDs
filtered_age <- filtered_age[order(filtered_age$CASE_ID),]
filtered_counts <- filtered_counts[,order(colnames(filtered_counts))]
rm("RNAseq_counts")
## conver ENSEMBL IDs to gene symbols
filtered_counts[,ncol(filtered_counts)+1] <- mapIds(org.Hs.eg.db, keys = rownames(filtered_counts),
keytype = "ENSEMBL", column="SYMBOL", "first")
## 'select()' returned 1:many mapping between keys and columns
filtered_counts <- na.omit(filtered_counts)
filtered_counts <- filtered_counts[-which(duplicated(filtered_counts[,ncol(filtered_counts)])),]
rownames(filtered_counts) <- filtered_counts[,ncol(filtered_counts)]
filtered_counts <- filtered_counts[,-ncol(filtered_counts)]
Score signatures across the entire breast cancer dataset. Ideally, enrichment in older age groups should be equal to that in middle aged groups. This will suggest that the middle age groups have accelerated aging.
I have binned actual ages from 21-30, 31-40 … because the minimum and maximum ages are 26 and 90 respectively.
set.seed(2019)
## generate signature
brca_ssgsea <- gsva(as.matrix(filtered_counts), brca_aging_geneset, "Poisson", "ssgsea")
## Estimating ssGSEA scores for 4 gene sets.
##
|
| | 0%Using parallel with 4 cores
##
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
## check if the order is the same
all(colnames(brca_ssgsea) == filtered_age$CASE_ID)
## [1] TRUE
## correlate age with signature
print(cor.test(brca_ssgsea[1,], filtered_age$AGE)) # aging upregulated genes
##
## Pearson's product-moment correlation
##
## data: brca_ssgsea[1, ] and filtered_age$AGE
## t = 10.17, df = 946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2553701 0.3702083
## sample estimates:
## cor
## 0.3139369
print(cor.test(brca_ssgsea[2,], filtered_age$AGE)) # aging downregulated genes
##
## Pearson's product-moment correlation
##
## data: brca_ssgsea[2, ] and filtered_age$AGE
## t = -10.852, df = 946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3881659 -0.2748693
## sample estimates:
## cor
## -0.3327177
print(cor.test(brca_ssgsea[3,], filtered_age$AGE)) # aging upregulated genes with logFC > 1
##
## Pearson's product-moment correlation
##
## data: brca_ssgsea[3, ] and filtered_age$AGE
## t = 9.0897, df = 946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2237798 0.3409321
## sample estimates:
## cor
## 0.283413
print(cor.test(brca_ssgsea[4,], filtered_age$AGE)) # aging downregulated genes with logFC < -1
##
## Pearson's product-moment correlation
##
## data: brca_ssgsea[4, ] and filtered_age$AGE
## t = -10.069, df = 946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3675248 -0.2524643
## sample estimates:
## cor
## -0.3111342
brca_ssgsea <- data.frame(t(brca_ssgsea), stringsAsFactors = F)
## add actual age and age quartile information to the signature DF
brca_ssgsea <- data.frame(cbind(brca_ssgsea),
filtered_age$AGE,
filtered_age$Subtype_Quartile,
stringsAsFactors = FALSE)
colnames(brca_ssgsea)[5:6] <- c("age","quartiles")
## Bin ages into 10-year groups
binned_age <- mutate(brca_ssgsea, Age_Bins = age.cat(brca_ssgsea$age, upper = 100))
binned_age_plot <- melt(binned_age[,-5])
## Plot the signature across age bins
## Try with the oldest group as reference
ggboxplot(binned_age_plot, x = 'Age_Bins', y = 'value', fill = 'Age_Bins',
add = "jitter", facet.by = 'variable', main = "brca aging ssGSEA enrichment",
subtitle = "Ref. group: 81-90") +
stat_compare_means(method = "wilcox", ref.group = "81-90",label = "p.signif", hide.ns = TRUE)
## Try with the second oldest group as reference
ggboxplot(binned_age_plot, x = 'Age_Bins', y = 'value', fill = 'Age_Bins',
add = "jitter", facet.by = 'variable', main = "brca aging ssGSEA enrichment",
subtitle = "Ref. group: 71-80") +
stat_compare_means(method = "wilcox", ref.group = "71-80",label = "p.signif", hide.ns = TRUE)
## Try with the third oldest group as reference
ggboxplot(binned_age_plot, x = 'Age_Bins', y = 'value', fill = 'Age_Bins',
add = "jitter", facet.by = 'variable', main = "brca aging ssGSEA enrichment",
subtitle = "Ref. group: 61-70") +
stat_compare_means(method = "wilcox", ref.group = "61-70",label = "p.signif", hide.ns = TRUE)
All signatures have a statistically significant absolute correlation of ~0.3.
All older age groups enrichments seem to be significantly different from younger ones.
Accelerated aging is not captured in this analysis.
Load clinical data and merge it with the signature score DF.
## load clinical data
brca_meta <- read.csv("~/Documents/Elemento/AgeTCGA-master/DATA/cBioportal_Survival_Data/Clinical_Data_brca_tcga_all.txt", stringsAsFactors = FALSE,
header = T, sep = "\t")
## create a single DF that contains enrichment, age metadata and cbioportal clinical data
brca_meta <- merge(Age_OnyCT.meta, brca_meta, "CASE_ID")
brca_ssgsea[,ncol(brca_ssgsea)+1] <- rownames(brca_ssgsea)
colnames(brca_ssgsea)[ncol(brca_ssgsea)] <- "CASE_ID"
## this is the final DF that we will use
CancerType.sel <- merge(brca_ssgsea, brca_meta, "CASE_ID")
## divide our variable of interest into quantiles
# CancerType.sel <- mutate(CancerType.sel, SignatureScore = ntile(CancerType.sel$brca_age_down_logFC_1, 3)) ## divine signature into ntiles
# CancerType.sel <- mutate(CancerType.sel, AgeScore = ntile(CancerType.sel$brca_age_up_logFC_1, 3)) # divide age into ntiles
Run cox regression on extreme quantiles of the age signature as well as actual age to check for overall survival benefits.
Only one of the four signatures (genes downregulated with aging with logFC < -1) shows any significance, so I’ve used that for the regression model.
CancerType.sel$OS_MONTHS = as.numeric(as.character(CancerType.sel$OS_MONTHS))
CancerType.sel$event = ifelse(CancerType.sel$OS_STATUS =="LIVING",0,1)
sig_names <- c("2 quantiles", "3 quantiles",
"4 quantiles", "5 quantiles")
## save KM plots to a list
sigPlotx <- list(0,0,0,0)
names(sigPlotx) <- c(sig_names)
## loop to generate KM plots for 2,3,4 and 5 quantiles
for (i in 2:5) {
## divide the signature into quantiles
CancerType.sel <- mutate(CancerType.sel, SignatureScore = ntile(CancerType.sel$brca_age_down_logFC_1, i)) ## divine signature into ntiles
## model fit
surv_result = summary(coxph(Surv(time=OS_MONTHS ,event = event ) ~ SignatureScore,
data = CancerType.sel[which(CancerType.sel$SignatureScore == 1 |
CancerType.sel$SignatureScore == i),]))
fit <- survfit(Surv(time= OS_MONTHS,event = event )~ SignatureScore,
data = CancerType.sel[which(CancerType.sel$SignatureScore == 1 |
CancerType.sel$SignatureScore == i),])
## get model details
Surv_pval <- coef(surv_result)[1,5]
hazard_ratio = round(surv_result$coefficients[2],5)
Surv_N <- surv_result$n
Surv_events <- surv_result$nevent
## Plot using survminer
sigPlotx[[i-1]] <- ggsurvplot(fit,
data = CancerType.sel,
risk.table = FALSE,
size = 1,
conf.int = FALSE,
pval = TRUE,
palette = c("dodgerblue2","firebrick2"),
legend.labs = c("Old", "Young"),
title = paste("OS: Signature",sig_names[i-1], sep = " ")
)
}
## Similar code to look at actual age quantiles
age_names <- c("2 quantiles", "3 quantiles",
"4 quantiles", "5 quantiles")
## save KM plots to a list
agePlotx <- list(0,0,0,0)
names(agePlotx) <- c(agePlotx)
## loop to generate KM plots for 2,3,4 and 5 quantiles
for (i in 2:5) {
## divide age into quantiles
CancerType.sel <- mutate(CancerType.sel, AgeScore = ntile(CancerType.sel$age, i))
## model fit
surv_result = summary(coxph(Surv(time=OS_MONTHS ,event = event ) ~ AgeScore,
data = CancerType.sel[which(CancerType.sel$AgeScore == 1 |
CancerType.sel$AgeScore == i),]))
fit <- survfit(Surv(time= OS_MONTHS,event = event )~ AgeScore,
data = CancerType.sel[which(CancerType.sel$AgeScore == 1 |
CancerType.sel$AgeScore == i),])
## get model details
Surv_pval <- coef(surv_result)[1,5]
hazard_ratio = round(surv_result$coefficients[2],5)
Surv_N <- surv_result$n
Surv_events <- surv_result$nevent
## Plot using survminer
agePlotx[[i-1]] <- ggsurvplot(fit,
data = CancerType.sel,
risk.table = FALSE,
size = 1,
conf.int = FALSE,
pval = TRUE,
palette = c("firebrick2","dodgerblue2"),
legend.labs = c("Young", "Old"),
ggtheme = theme_gray(),
title = paste("OS: Actual Age",sig_names[i-1], sep = " ")
)
}
## View the KM plots
arrange_ggsurvplots(agePlotx)
arrange_ggsurvplots(sigPlotx)
Run cox regression on extreme quantiles of the age signature as well as actual age to check for overall survival benefits.
Nither actual age, nor signature quantiles show any significant association with disease free survival.
CancerType.sel$DFS_MONTHS = as.numeric(as.character(CancerType.sel$DFS_MONTHS))
CancerType.sel$event = ifelse(CancerType.sel$DFS_STATUS =="DiseaseFree",0,1)
sig_names <- c("2 quantiles", "3 quantiles",
"4 quantiles", "5 quantiles")
## save KM plots to a list
sigPlotx <- list(0,0,0,0)
names(sigPlotx) <- c(sig_names)
## loop to generate KM plots for 2,3,4 and 5 quantiles
for (i in 2:5) {
## divide the signature into quantiles
CancerType.sel <- mutate(CancerType.sel, SignatureScore = ntile(CancerType.sel$brca_age_down_logFC_1, i)) ## divine signature into ntiles
## model fit
surv_result = summary(coxph(Surv(time=DFS_MONTHS ,event = event ) ~ SignatureScore,
data = CancerType.sel[which(CancerType.sel$SignatureScore == 1 |
CancerType.sel$SignatureScore == i),]))
fit <- survfit(Surv(time= DFS_MONTHS,event = event )~ SignatureScore,
data = CancerType.sel[which(CancerType.sel$SignatureScore == 1 |
CancerType.sel$SignatureScore == i),])
## get model details
Surv_pval <- coef(surv_result)[1,5]
hazard_ratio = round(surv_result$coefficients[2],5)
Surv_N <- surv_result$n
Surv_events <- surv_result$nevent
## Plot using survminer
sigPlotx[[i-1]] <- ggsurvplot(fit,
data = CancerType.sel,
risk.table = FALSE,
size = 1,
conf.int = FALSE,
pval = TRUE,
palette = c("dodgerblue2","firebrick2"),
legend.labs = c("Old", "Young"),
title = paste("DFS: Signature",sig_names[i-1], sep = " ")
)
}
## Similar code to look at actual age quantiles
age_names <- c("2 quantiles", "3 quantiles",
"4 quantiles", "5 quantiles")
## save KM plots to a list
agePlotx <- list(0,0,0,0)
names(agePlotx) <- c(agePlotx)
## loop to generate KM plots for 2,3,4 and 5 quantiles
for (i in 2:5) {
## divide age into quantiles
CancerType.sel <- mutate(CancerType.sel, AgeScore = ntile(CancerType.sel$age, i))
## model fit
surv_result = summary(coxph(Surv(time=DFS_MONTHS ,event = event ) ~ AgeScore,
data = CancerType.sel[which(CancerType.sel$AgeScore == 1 |
CancerType.sel$AgeScore == i),]))
fit <- survfit(Surv(time= DFS_MONTHS,event = event )~ AgeScore,
data = CancerType.sel[which(CancerType.sel$AgeScore == 1 |
CancerType.sel$AgeScore == i),])
## get model details
Surv_pval <- coef(surv_result)[1,5]
hazard_ratio = round(surv_result$coefficients[2],5)
Surv_N <- surv_result$n
Surv_events <- surv_result$nevent
## Plot using survminer
agePlotx[[i-1]] <- ggsurvplot(fit,
data = CancerType.sel,
risk.table = FALSE,
size = 1,
conf.int = FALSE,
pval = TRUE,
palette = c("firebrick2","dodgerblue2"),
legend.labs = c("Young", "Old"),
ggtheme = theme_gray(),
title = paste("DFS: Actual Age",sig_names[i-1], sep = " ")
)
}
## View the KM plots
arrange_ggsurvplots(agePlotx)
arrange_ggsurvplots(sigPlotx)